SatRday Chicago, April 27, 2019

Parfait Gasana

Data Analyst, Winston & Strawn
  @ParfaitG (GitHub)




R as an Extendable Environment



Relational Databases



Advantages of Relational Databases in Data Science



Programming Interfaces



R DBI Standard



Use Cases



CTA Bus/Rail Aggregation

library(DBI)
library(RPostgreSQL)

# CONNECT
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host=config$pg_conn$host, dbname=config$pg_conn$name,
                  user=config$pg_conn$user, password=config$pg_conn$pwd, port=config$pg_conn$port)
# R AGGREGATION (MULTIPLE FUNCS)
# do.call(data.frame, 
#        aggregate(rides ~ route + routename, agg_csv,
#                  function(x) c(count=length(x), sum=sum(x), mean=mean(x),
#                                median=median(x), min=min(x), max=max(x))))

# POSTGRES AGGREGATION
sql <- 'SELECT rt.route_name, 
               COUNT(rd.rides) AS "count", 
               SUM(rd.rides) AS "sum", 
               AVG(rd.rides) AS "mean", 
               MEDIAN(rd.rides) AS "median",
               R_MEDIAN(rd.rides) AS "r_median",
               MIN(rd.rides) AS "min", 
               MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC
        LIMIT 10'

agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql),
              bootstrap_options = c("striped", "hover", "condensed"))
route_name count sum mean median r_median min max
79th 6513 176238493 27059.50 27672 27672 7896 43081
Ashland 6513 156178845 23979.56 23676 23676 5570 45177
Chicago 6513 130620291 20055.32 21767 21767 4448 35893
Western 6513 129322319 19856.03 19553 19553 4985 37202
Cottage Grove 6513 128999766 19806.50 21181 21181 5489 31187
Belmont 6513 123065617 18895.38 20732 20732 3451 30025
Clark 6513 119459342 18341.68 19171 19171 3310 26896
King Drive 6513 119070638 18282.00 19402 19402 3993 29896
Halsted 6513 118284951 18161.36 19305 19305 2915 30605
Sheridan 6513 118178375 18145.00 18812 18812 2666 33236

Graphing

ggplot(agg_sql, aes(route_name, sum, fill=route_name)) + geom_col(position = "dodge") +
  labs(title="CTA Top 10 Bus Routes by Ridership", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma, limit=c(0,2E8)) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Calculation + Aggregation

sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
year rail_line rail_total
79 2007 pink 8144476
81 2009 pink 8804067
129 2003 red 54191628
52 2016 green 14596581
23 2005 brown 15526814
16 2016 blue 47211306
100 2010 purple 2076566
143 2017 red 63047814
135 2009 red 59752633
7 2007 blue 34996201
17 2017 blue 46113140
82 2010 pink 8872230
6 2006 blue 36988999
65 2011 orange 13065864
36 2018 brown 18005370
109 2001 purple_exp 11168731
48 2012 green 14658089
55 2001 orange 11012243
61 2007 orange 12469954
92 2002 purple 2000502

Graphing

cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(agg_sql, aes(year, rail_total, color=rail_line)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="CTA Rail Ridership By Year", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = cta_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Modeling

CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

scroll_box(kable_styling(kable(head(rail_model_data)),
                        bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
id station_id station_name ride_date day_type raw rides season latitude longitude rail_line ada direction ue_rate gas_price avg_temp precipitation snow_depth
62 40010 Austin-Forest Park 2001-01-01 U 290 290 winter 41.87085 -87.77681 blue 0 W | E 5.3 1.487 14.5 0 17
61 40020 Harlem-Lake 2001-01-01 U 633 633 winter 41.88685 -87.80318 green 1 E | W 5.3 1.487 14.5 0 17
50 40030 Pulaski-Lake 2001-01-01 U 483 483 winter 41.88541 -87.72540 green 1 W | E 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 pink 0 N 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 brown 0 S 5.3 1.487 14.5 0 17
109 40040 Quincy/Wells 2001-01-01 U 374 93 winter 41.87872 -87.63374 orange 0 N 5.3 1.487 14.5 0 17
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

# ANOVA
kable_styling(kable(xtable(anova(model))),
              bootstrap_options = c("striped", "hover", "condensed"))
Df Sum Sq Mean Sq F value Pr(>F)
day_type 2 445663599687 222831799844 64679.229601 0.0000000
season 3 12065643417 4021881139 1167.392508 0.0000000
latitude 1 86867071786 86867071786 25214.064082 0.0000000
longitude 1 319342686 319342686 92.692510 0.0000000
rail_line 8 1832581552014 229072694002 66490.713539 0.0000000
ue_rate 1 1655357962 1655357962 480.484732 0.0000000
gas_price 1 22101747909 22101747909 6415.260428 0.0000000
avg_temp 1 3204630785 3204630785 930.177158 0.0000000
precipitation 1 878964701 878964701 255.128576 0.0000000
snow_depth 1 8106972 8106972 2.353132 0.1250319
Residuals 1038214 3576837505172 3445183 NA NA
# POINT ESTIMATES
kable_styling(kable(xtable(summary(model))),
              bootstrap_options = c("striped", "hover", "condensed"))
Estimate Std. Error t value Pr(>|t|)
(Intercept) -67333.240996 3539.2418169 -19.024764 0.0000000
day_typeU -454.935197 6.6213391 -68.707430 0.0000000
day_typeW 1159.154756 5.2660141 220.119950 0.0000000
seasonspring -180.266785 5.3603614 -33.629595 0.0000000
seasonsummer -179.982024 6.5944919 -27.292781 0.0000000
seasonwinter -160.120620 6.1885199 -25.873815 0.0000000
latitude -3954.668735 36.5219229 -108.282051 0.0000000
longitude -2679.792012 43.6084183 -61.451254 0.0000000
rail_linebrown -1187.766815 7.2025790 -164.908543 0.0000000
rail_linegreen -2076.775862 6.8343576 -303.872871 0.0000000
rail_lineorange -1082.768528 8.2804788 -130.761584 0.0000000
rail_linepink -2215.792821 7.3581389 -301.134954 0.0000000
rail_linepurple -2043.636304 11.1396088 -183.456738 0.0000000
rail_linepurple_exp -1558.043094 7.6564149 -203.495122 0.0000000
rail_linered 1636.661666 6.9037116 237.069820 0.0000000
rail_lineyellow -1418.136925 17.5558352 -80.778665 0.0000000
ue_rate -7.639672 0.9665412 -7.904135 0.0000000
gas_price 188.159773 2.4375323 77.192729 0.0000000
avg_temp 5.141775 0.1658352 31.005329 0.0000000
precipitation -84.092077 5.2552649 -16.001492 0.0000000
snow_depth 1.848654 1.2051261 1.533992 0.1250319
graph_data <- data.frame(param = names(model$coefficients[-1]),
                         value = model$coefficients[-1],
                         row.names = NULL)

ggplot(graph_data) + geom_col(aes(x=param, y=value, fill=param), position = "dodge") +
  labs(title="CTA System Rail Regression Point Estimates", x="Parameters", y="Value") +
  guides(fill=FALSE) + ylim(-4000, 2000) + 
  scale_fill_manual(values = seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=45, vjust=0.95, hjust=0.95))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)
## [1] TRUE


Divvy Time and Distance Calculation

options(connectionObserver = NULL)

library(odbc)

conn <- dbConnect(odbc::odbc(),
                  instance = config$db2_conn$instance,
                  driver = config$db2_conn$driver,
                  database = config$db2_conn$database,
                  server = config$db2_conn$server,
                  hostname = config$db2_conn$hostname,
                  port = config$db2_conn$port,
                  uid = config$db2_conn$uid,
                  pwd = config$db2_conn$pwd,
                  LongDataCompat = 1)
day_df <- dbGetQuery(conn, "SELECT t.START_TIME, t.STOP_TIME,
                                   t.TRIP_DURATION
                            FROM TRIPS t
                            WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD')
                            ORDER BY t.START_TIME")

kable_styling(kable(head(day_df, 20)),
              bootstrap_options = c("striped", "hover", "condensed"))
START_TIME STOP_TIME TRIP_DURATION
2015-09-23 00:00:00 2015-09-23 00:07:00 414
2015-09-23 00:00:00 2015-09-23 00:03:00 183
2015-09-23 00:01:00 2015-09-23 00:09:00 450
2015-09-23 00:01:00 2015-09-23 00:24:00 1350
2015-09-23 00:01:00 2015-09-23 00:07:00 402
2015-09-23 00:02:00 2015-09-23 00:15:00 793
2015-09-23 00:03:00 2015-09-23 00:27:00 1435
2015-09-23 00:04:00 2015-09-23 00:06:00 134
2015-09-23 00:05:00 2015-09-23 00:19:00 820
2015-09-23 00:06:00 2015-09-23 00:15:00 526
2015-09-23 00:07:00 2015-09-23 00:15:00 506
2015-09-23 00:08:00 2015-09-23 00:15:00 425
2015-09-23 00:09:00 2015-09-23 00:32:00 1372
2015-09-23 00:10:00 2015-09-23 00:24:00 881
2015-09-23 00:10:00 2015-09-23 00:35:00 1445
2015-09-23 00:10:00 2015-09-23 00:26:00 935
2015-09-23 00:11:00 2015-09-23 00:23:00 717
2015-09-23 00:13:00 2015-09-23 00:19:00 382
2015-09-23 00:14:00 2015-09-23 00:15:00 66
2015-09-23 00:16:00 2015-09-23 00:38:00 1316
ggplot(day_df, aes(START_TIME, TRIP_DURATION)) + geom_line(color=seaborn_palette[1]) +
  xlab("Start Time") + ylab("Trip Duration")

agg_sql <- dbGetQuery(conn, "WITH sub AS 
                                (SELECT t.FROM_STATION_ID, t.FROM_STATION_NAME,
                                        t.FROM_LATITUDE, t.FROM_LONGITUDE,
                                        ROUND(CASE 
                                                   WHEN t.TO_LATITUDE IS NOT NULL AND t.FROM_LATITUDE IS NOT NULL
                                                   THEN
                                                        3963.1 * (2 * ATAN(1) - 
                                                                  ASIN(SIN(RADIANS(t.FROM_LATITUDE)) * SIN(RADIANS(t.TO_LONGITUDE)) + 
                                                                       COS(RADIANS(t.FROM_LATITUDE)) * COS(RADIANS(t.TO_LONGITUDE)) * 
                                                                       COS(RADIANS(t.TO_LONGITUDE - t.FROM_LONGITUDE))
                                                                      ) 
                                                                 )
                                                   ELSE NULL
                                              END, 4) AS TRIP_DISTANCE
                                 FROM TRIPS t
                                 WHERE DATE(t.START_TIME) = TO_DATE('2015-09-23', 'YYYY-MM-DD'))
                            
                            SELECT FROM_STATION_ID, FROM_STATION_NAME, 
                                   FROM_LATITUDE, FROM_LONGITUDE,
                                   MIN(TRIP_DISTANCE) AS MIN_DISTANCE,
                                   MAX(TRIP_DISTANCE) AS MAX_DISTANCE
                            
                            FROM sub 
                            GROUP BY FROM_STATION_ID, FROM_STATION_NAME, 
                                     FROM_LATITUDE, FROM_LONGITUDE
                            ORDER BY MAX(TRIP_DISTANCE) DESC
                            FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(agg_sql), bootstrap_options = c("striped", "hover", "condensed"))
FROM_STATION_ID FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE MIN_DISTANCE MAX_DISTANCE
467 Western Ave & Lunt Ave 42.00859 -87.69049 8969.148 8971.314
494 Kedzie Ave & Bryn Mawr Ave 41.98240 -87.70892 8969.882 8971.135
447 Glenwood Ave & Morse Ave 42.00797 -87.66550 8967.513 8971.132
466 Ridge Blvd & Touhy Ave 42.01213 -87.68291 8969.393 8970.895
294 Broadway & Berwyn Ave 41.97835 -87.65975 8963.826 8970.855
479 Drake Ave & Montrose Ave 41.96115 -87.71657 8969.369 8970.804
469 St. Louis Ave & Balmoral Ave 41.98039 -87.71611 8967.694 8970.700
450 Warren Park West 42.00201 -87.68973 8970.580 8970.580
451 Sheridan Rd & Loyola Ave 42.00104 -87.66120 8966.526 8970.579
325 Clark St & Winnemac Ave 41.97339 -87.66836 8964.613 8970.512

Mapping

divvy_from <- dbGetQuery(conn, "SELECT t.FROM_STATION_NAME, 
                                       t.FROM_LATITUDE, 
                                       t.FROM_LONGITUDE,
                                       SUM(t.TRIP_DURATION) AS Total_Duration,
                                       MIN(t.TRIP_DURATION) AS Min_Duration,
                                       AVG(t.TRIP_DURATION) AS Avg_Duration,
                                       MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                       MAX(t.TRIP_DURATION) AS Max_Duration
                                       
                                FROM TRIPS t
                                GROUP BY t.FROM_STATION_NAME,
                                         t.FROM_LATITUDE, 
                                         t.FROM_LONGITUDE
                                ORDER BY SUM(t.TRIP_DURATION) DESC
                                FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(divvy_from), bootstrap_options = c("striped", "hover", "condensed"))
FROM_STATION_NAME FROM_LATITUDE FROM_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Lake Shore Dr & Monroe St 41.88096 -87.61674 444455108 60 1803 1347 6403880
Streeter Dr & Grand Ave 41.89228 -87.61204 431638473 60 1956 1417 5444590
Michigan Ave & Oak St 41.90096 -87.62378 351053592 60 1815 1300 2549130
Millennium Park 41.88103 -87.62408 347331136 60 1820 1241 6785180
Theater on the Lake 41.92628 -87.63083 330660276 60 1524 1285 740561
Lake Shore Dr & North Blvd 41.91172 -87.62680 286215561 60 1452 1173 336578
Streeter Dr & Illinois St 41.89107 -87.61220 222454638 60 1615 1320 84073
Canal St & Adams St 41.87926 -87.63990 184661822 60 837 614 9961130
Michigan Ave & Washington St 41.88389 -87.62465 179427697 60 1160 649 1876990
Shedd Aquarium 41.86722 -87.61535 178996820 60 1675 1307 1389890
ggplot(transform(divvy_from, FROM_STATION_NAME=gsub("&", "\n&", FROM_STATION_NAME)),
                 aes(FROM_STATION_NAME, TOTAL_DURATION, fill=FROM_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Origination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

divvy_to <- dbGetQuery(conn, "SELECT t.TO_STATION_NAME, 
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE,
                                     SUM(t.TRIP_DURATION) AS Total_Duration,
                                     MIN(t.TRIP_DURATION) AS Min_Duration,
                                     AVG(t.TRIP_DURATION) AS Avg_Duration,
                                     MEDIAN(t.TRIP_DURATION) AS Median_Duration,
                                     MAX(t.TRIP_DURATION) AS Max_Duration
                                   
                            FROM TRIPS t
                            WHERE TO_STATION_NAME <> 'DIVVY Map Frame B/C Station'
                            GROUP BY t.TO_STATION_NAME,
                                     t.TO_LATITUDE, 
                                     t.TO_LONGITUDE
                            ORDER BY SUM(t.TRIP_DURATION) DESC
                            FETCH FIRST 10 ROWS ONLY;")

kable_styling(kable(divvy_to), bootstrap_options = c("striped", "hover", "condensed"))
TO_STATION_NAME TO_LATITUDE TO_LONGITUDE TOTAL_DURATION MIN_DURATION AVG_DURATION MEDIAN_DURATION MAX_DURATION
Streeter Dr & Grand Ave 41.89228 -87.61204 469992965 60 1914 1432 297905
Lake Shore Dr & Monroe St 41.88096 -87.61674 396754905 60 1702 1318 731920
Theater on the Lake 41.92628 -87.63083 370722559 60 1598 1374 383963
Michigan Ave & Oak St 41.90096 -87.62378 369016876 60 1782 1306 2549130
Millennium Park 41.88103 -87.62408 344723608 60 1637 1157 566401
Lake Shore Dr & North Blvd 41.91172 -87.62680 337381528 60 1524 1295 269506
Streeter Dr & Illinois St 41.89107 -87.61220 264397622 60 1611 1325 85368
Michigan Ave & Washington St 41.88389 -87.62465 158901379 60 1003 572 3054250
Shedd Aquarium 41.86722 -87.61535 158356779 60 1563 1307 184015
Adler Planetarium 41.86610 -87.60727 153361126 60 1635 1357 524315
ggplot(transform(divvy_to, TO_STATION_NAME=gsub("&", "\n&", TO_STATION_NAME)), 
       aes(TO_STATION_NAME, TOTAL_DURATION, fill=TO_STATION_NAME)) +
  geom_col(position = "dodge") +
  labs(title="Divvy Top 10 Destination Stations", x="Station", y="Trip Duration (seconds)") +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

library(jsonlite)

x <- toJSON(divvy_from[c("FROM_STATION_NAME", "FROM_LATITUDE", "FROM_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_From_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6167
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Grand Ave",
##     "FROM_LATITUDE": 41.8923,
##     "FROM_LONGITUDE": -87.612
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Oak St",
##     "FROM_LATITUDE": 41.901,
##     "FROM_LONGITUDE": -87.6238
##   },
##   {
##     "FROM_STATION_NAME": "Millennium Park",
##     "FROM_LATITUDE": 41.881,
##     "FROM_LONGITUDE": -87.6241
##   },
##   {
##     "FROM_STATION_NAME": "Theater on the Lake",
##     "FROM_LATITUDE": 41.9263,
##     "FROM_LONGITUDE": -87.6308
##   },
##   {
##     "FROM_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "FROM_LATITUDE": 41.9117,
##     "FROM_LONGITUDE": -87.6268
##   },
##   {
##     "FROM_STATION_NAME": "Streeter Dr & Illinois St",
##     "FROM_LATITUDE": 41.8911,
##     "FROM_LONGITUDE": -87.6122
##   },
##   {
##     "FROM_STATION_NAME": "Canal St & Adams St",
##     "FROM_LATITUDE": 41.8793,
##     "FROM_LONGITUDE": -87.6399
##   },
##   {
##     "FROM_STATION_NAME": "Michigan Ave & Washington St",
##     "FROM_LATITUDE": 41.8839,
##     "FROM_LONGITUDE": -87.6246
##   },
##   {
##     "FROM_STATION_NAME": "Shedd Aquarium",
##     "FROM_LATITUDE": 41.8672,
##     "FROM_LONGITUDE": -87.6154
##   }
## ]
x <- toJSON(divvy_to[c("TO_STATION_NAME", "TO_LATITUDE", "TO_LONGITUDE")], pretty=TRUE)

# EXPORT TO FILE
fileConn <- file("Divvy_To_Coords.json")
writeLines(x, fileConn)
close(fileConn)

x
## [
##   {
##     "TO_STATION_NAME": "Streeter Dr & Grand Ave",
##     "TO_LATITUDE": 41.8923,
##     "TO_LONGITUDE": -87.612
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & Monroe St",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6167
##   },
##   {
##     "TO_STATION_NAME": "Theater on the Lake",
##     "TO_LATITUDE": 41.9263,
##     "TO_LONGITUDE": -87.6308
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Oak St",
##     "TO_LATITUDE": 41.901,
##     "TO_LONGITUDE": -87.6238
##   },
##   {
##     "TO_STATION_NAME": "Millennium Park",
##     "TO_LATITUDE": 41.881,
##     "TO_LONGITUDE": -87.6241
##   },
##   {
##     "TO_STATION_NAME": "Lake Shore Dr & North Blvd",
##     "TO_LATITUDE": 41.9117,
##     "TO_LONGITUDE": -87.6268
##   },
##   {
##     "TO_STATION_NAME": "Streeter Dr & Illinois St",
##     "TO_LATITUDE": 41.8911,
##     "TO_LONGITUDE": -87.6122
##   },
##   {
##     "TO_STATION_NAME": "Michigan Ave & Washington St",
##     "TO_LATITUDE": 41.8839,
##     "TO_LONGITUDE": -87.6246
##   },
##   {
##     "TO_STATION_NAME": "Shedd Aquarium",
##     "TO_LATITUDE": 41.8672,
##     "TO_LONGITUDE": -87.6154
##   },
##   {
##     "TO_STATION_NAME": "Adler Planetarium",
##     "TO_LATITUDE": 41.8661,
##     "TO_LONGITUDE": -87.6073
##   }
## ]

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Metra Data Normalization and Testing

library(RSQLite)

conn <- dbConnect(RSQLite::SQLite(), dbname=config$sqlite_conn$database)

By Year

sql <- "SELECT l.Line, 
               t.Line AS Short,
               CAST(strftime('%Y', Report_Month) AS INT) as Year,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.Line,
                 t.Line,
                 strftime('%Y', t.Report_Month)"

agg_sql <- dbGetQuery(conn, sql)

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
Line Short Year Total_Late_Trains Avg_Late_Trains
52 Milwaukee District / North Milw-N 2010 2860 39.722222
95 SouthWest Service SWS 2013 1035 17.250000
28 Metra Electric Blue Island Elec-SC 2016 722 10.027778
68 Milwaukee District / West Milw-W 2016 2465 34.236111
66 Milwaukee District / West Milw-W 2014 3175 44.097222
123 Union Pacific / West UP-W 2011 4298 59.694444
26 Metra Electric Blue Island Elec-SC 2014 973 13.513889
31 Metra Electric Main Line Elec-ML 2009 2027 28.152778
16 Heritage Corridor Heritage 2014 393 10.342105
86 Rock Island District RI 2014 3521 48.902778
22 Metra Electric Blue Island Elec-SC 2010 692 9.611111
74 North Central Service NCS 2012 1272 26.500000
115 Union Pacific / Northwest UP-NW 2013 2856 39.666667
4 BNSF Railway BNSF 2012 3035 42.152778
104 Union Pacific / North UP-N 2012 2004 27.833333
117 Union Pacific / Northwest UP-NW 2015 1904 28.848485
50 Metra Electric South Chicago Elec-BI 2018 503 8.383333
90 Rock Island District RI 2018 3053 42.402778
92 SouthWest Service SWS 2010 1385 23.083333
57 Milwaukee District / North Milw-N 2015 2432 36.848485
ggplot(agg_sql, aes(Year, Total_Late_Trains, color=Line)) + 
  geom_line(stat="identity") +
  labs(title="Metra Late Trains by Line", x="Year", y="Late Trains") +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_color_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))


Statistical Testing

T-Tests

lines_list <- split(agg_sql, agg_sql$Short)

nms <- lapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) paste(i, collapse="_"))

res <- t(sapply(combn(unique(agg_sql$Short), 2, simplify=FALSE), function(i) {
  t <- t.test(lines_list[[i[1]]]$Total_Late_Trains, lines_list[[i[2]]]$Total_Late_Trains)
  c(statistic = t$statistic, p_value = t$p.value)
}))

row.names(res) <- nms

scroll_box(kable_styling(kable(res), bootstrap_options = c("striped", "hover", "condensed")),
           height="800px")
statistic.t p_value
BNSF_Heritage 9.8494756 0.0000034
BNSF_Elec-SC 8.7158417 0.0000076
BNSF_Elec-ML 6.1374028 0.0000957
BNSF_Elec-BI 9.3920454 0.0000054
BNSF_Milw-N 2.8910097 0.0111757
BNSF_Milw-W 4.9595192 0.0002727
BNSF_NCS 7.9789334 0.0000143
BNSF_RI 4.5415218 0.0006351
BNSF_SWS 7.5383993 0.0000207
BNSF_UP-N 4.9702928 0.0001637
BNSF_UP-NW 5.4107001 0.0002478
BNSF_UP-W 3.7401929 0.0021030
Heritage_Elec-SC -5.2621722 0.0000918
Heritage_Elec-ML -12.0629835 0.0000001
Heritage_Elec-BI -3.4987332 0.0027396
Heritage_Milw-N -10.3277589 0.0000017
Heritage_Milw-W -9.1513465 0.0000037
Heritage_NCS -7.7996898 0.0000020
Heritage_RI -11.0240760 0.0000006
Heritage_SWS -8.6861313 0.0000010
Heritage_UP-N -6.3532760 0.0001028
Heritage_UP-NW -13.6677426 0.0000000
Heritage_UP-W -9.5307864 0.0000033
Elec-SC_Elec-ML -7.6930971 0.0000013
Elec-SC_Elec-BI 3.0553272 0.0090725
Elec-SC_Milw-N -8.4524134 0.0000058
Elec-SC_Milw-W -6.7021299 0.0000291
Elec-SC_NCS -2.6856024 0.0153183
Elec-SC_RI -8.3005633 0.0000030
Elec-SC_SWS -3.9255453 0.0011167
Elec-SC_UP-N -4.5812945 0.0009268
Elec-SC_UP-NW -9.4434609 0.0000001
Elec-SC_UP-W -7.5259529 0.0000146
Elec-ML_Elec-BI 10.7051161 0.0000005
Elec-ML_Milw-N -4.3660738 0.0008571
Elec-ML_Milw-W -1.6151844 0.1279065
Elec-ML_NCS 5.2419772 0.0000733
Elec-ML_RI -2.7359672 0.0152246
Elec-ML_SWS 3.8518590 0.0012327
Elec-ML_UP-N -0.7261232 0.4813509
Elec-ML_UP-NW -1.8073859 0.0875269
Elec-ML_UP-W -3.2060881 0.0069770
Elec-BI_Milw-N -9.6182342 0.0000038
Elec-BI_Milw-W -8.2352811 0.0000119
Elec-BI_NCS -5.9428602 0.0000643
Elec-BI_RI -10.0649428 0.0000019
Elec-BI_SWS -7.0205384 0.0000176
Elec-BI_UP-N -5.6282342 0.0002825
Elec-BI_UP-NW -12.4136552 0.0000001
Elec-BI_UP-W -8.7676538 0.0000080
Milw-N_Milw-W 2.6700024 0.0162767
Milw-N_NCS 7.2630725 0.0000175
Milw-N_RI 2.0467572 0.0573187
Milw-N_SWS 6.5511394 0.0000355
Milw-N_UP-N 2.7933483 0.0120074
Milw-N_UP-NW 3.2584554 0.0063262
Milw-N_UP-W 1.0801680 0.2944043
Milw-W_NCS 5.1854922 0.0002195
Milw-W_RI -0.8225248 0.4216052
Milw-W_SWS 4.2973444 0.0008763
Milw-W_UP-N 0.4888439 0.6313060
Milw-W_UP-NW 0.2804970 0.7829139
Milw-W_UP-W -1.5653195 0.1356000
NCS_RI -6.6347574 0.0000189
NCS_SWS -1.3501689 0.1939515
NCS_UP-N -3.4515401 0.0055512
NCS_UP-NW -7.0452631 0.0000029
NCS_UP-W -6.2606259 0.0000586
RI_SWS 5.6476491 0.0000687
RI_UP-N 1.1936645 0.2499458
RI_UP-NW 1.3014342 0.2118233
RI_UP-W -0.8818209 0.3903462
SWS_UP-N -2.7957833 0.0169999
SWS_UP-NW -5.6470712 0.0000295
SWS_UP-W -5.5094534 0.0001466
UP-N_UP-NW -0.3290665 0.7474460
UP-N_UP-W -1.8187654 0.0857421
UP-NW_UP-W -2.0429687 0.0613004

By Month

sql <- "SELECT l.ID,
               l.Line, 
               t.Line AS Short,
               CAST(strftime('%m', Report_Month) AS INT) as Month_Num,
               SUM(t.Late_Trains) AS Total_Late_Trains,
               AVG(t.Late_Trains) AS Avg_Late_Trains
        FROM Trains t
        INNER JOIN Lines l ON t.LineID = l.ID
        WHERE l.Line <> 'SYSTEM'
        GROUP BY l.ID,
                 l.Line,
                 t.Line,
                 CAST(strftime('%m', Report_Month) AS INT)
        ORDER BY l.ID,
                 CAST(strftime('%m', Report_Month) AS INT)"

agg_sql <- transform(dbGetQuery(conn, sql), Month = factor(month.abb[Month_Num], levels=month.abb))

kable_styling(kable(agg_sql[sample(1:nrow(agg_sql), 20),]), 
              bootstrap_options = c("striped", "hover", "condensed"))
ID Line Short Month_Num Total_Late_Trains Avg_Late_Trains Month
29 3 Metra Electric Main Line Elec-ML 5 1120 18.66667 May
22 2 Heritage Corridor Heritage 10 339 10.27273 Oct
87 8 North Central Service NCS 3 945 23.62500 Mar
111 10 SouthWest Service SWS 3 888 17.76000 Mar
62 6 Milwaukee District / North Milw-N 2 3327 55.45000 Feb
66 6 Milwaukee District / North Milw-N 6 3329 55.48333 Jun
35 3 Metra Electric Main Line Elec-ML 11 1339 22.31667 Nov
27 3 Metra Electric Main Line Elec-ML 3 1202 20.03333 Mar
131 11 Union Pacific / North UP-N 11 1652 27.53333 Nov
42 4 Metra Electric South Chicago Elec-BI 6 715 14.30000 Jun
9 1 BNSF Railway BNSF 9 3431 57.18333 Sep
33 3 Metra Electric Main Line Elec-ML 9 1516 25.26667 Sep
115 10 SouthWest Service SWS 7 1186 23.25490 Jul
149 13 Union Pacific / West UP-W 5 2436 40.60000 May
70 6 Milwaukee District / North Milw-N 10 2392 39.86667 Oct
24 2 Heritage Corridor Heritage 12 366 10.16667 Dec
143 12 Union Pacific / Northwest UP-NW 11 2071 34.51667 Nov
125 11 Union Pacific / North UP-N 5 1623 27.05000 May
37 4 Metra Electric South Chicago Elec-BI 1 780 15.60000 Jan
155 13 Union Pacific / West UP-W 11 2076 34.60000 Nov
ggplot(agg_sql, aes(Month, Total_Late_Trains, fill=Line)) + 
  geom_col(position = "dodge") +
  labs(title="Metra Late Trains by Month", x="Year", y="Late Trains") +
  scale_x_discrete("Month", breaks=unique(agg_sql$Month)) +
  scale_y_continuous(expand = c(0, 0), label=comma) + guides(fill=FALSE) +
  scale_fill_manual(values=seaborn_palette) +
  facet_wrap(~Line, ncol=3, scales="free_y") +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

Correlations

sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               SUM(CASE WHEN c.Late_Cause = 'Accident' THEN c.Late_Trains ELSE NULL END) AS [Accident],
               SUM(CASE WHEN c.Late_Cause = 'Catenary Failure' THEN c.Late_Trains ELSE NULL END) AS [Catenary],               
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Peak],
               SUM(CASE WHEN c.Late_Cause = 'Freight Interference - Off-Peak' THEN c.Late_Trains ELSE NULL END) AS [Freight Interference Off Peak],  
               SUM(CASE WHEN c.Late_Cause = 'Human Error' THEN c.Late_Trains ELSE NULL END) AS [Human Error],                     
               SUM(CASE WHEN c.Late_Cause = 'Lift Deployment' THEN c.Late_Trains ELSE NULL END) AS [Lift Deployment],                
               SUM(CASE WHEN c.Late_Cause = 'Locomotive Failure' THEN c.Late_Trains ELSE NULL END) AS [Locomotive Failure],              
               SUM(CASE WHEN c.Late_Cause = 'Non-Locomotive Equipment Failure' THEN c.Late_Trains ELSE NULL END) AS [Non-Locomotive Equipment Failure], 
               SUM(CASE WHEN c.Late_Cause = 'Obstruction/Debris' THEN c.Late_Trains ELSE NULL END) AS [Obstruction Debris],              
               SUM(CASE WHEN c.Late_Cause = 'Other' THEN c.Late_Trains ELSE NULL END) AS [Other],                            
               SUM(CASE WHEN c.Late_Cause = 'Passenger Loading' THEN c.Late_Trains ELSE NULL END) AS [Passenger Loading],
               SUM(CASE WHEN c.Late_Cause = 'Passenger Train Interference' THEN c.Late_Trains ELSE NULL END) AS [Passenger Train Interference],     
               SUM(CASE WHEN c.Late_Cause = 'Sick, Injured, Unruly Passenger' THEN c.Late_Trains ELSE NULL END) AS [Sick Injured Unruly Passenger],
               SUM(CASE WHEN c.Late_Cause = 'Signal/Switch Failure' THEN c.Late_Trains ELSE NULL END) AS [Signal Switch Failure],           
               SUM(CASE WHEN c.Late_Cause = 'Track Work' THEN c.Late_Trains ELSE NULL END) AS [Track Work],                      
               SUM(CASE WHEN c.Late_Cause = 'Weather' THEN c.Late_Trains ELSE NULL END) AS [Weather]            FROM Causes c 
      GROUP BY CAST(strftime('%Y', c.Report_Month) AS INT);"

wide_sql <- dbGetQuery(conn, sql)

scroll_box(kable_styling(knitr::kable(wide_sql),
                         bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
Year Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
2009 522 116 688 1040 1058 510 1202 402 798 538 2736 608 788 2798 1616 2150
2010 1600 196 1754 3054 2212 1082 2476 1144 1468 1158 4130 1484 1670 5290 2746 2946
2011 1338 80 990 2272 1740 902 1320 486 802 792 4290 988 1000 3296 2758 3094
2012 932 162 496 1474 1294 500 1086 326 848 638 2364 440 874 2506 1806 1262
2013 1150 338 642 1706 1508 410 1202 468 782 526 2276 404 768 3274 1328 2194
2014 1370 144 1336 2334 1470 428 1658 828 1100 568 1509 490 732 2536 1964 3322
2015 898 302 726 1360 1318 330 NA NA 870 488 1084 260 574 1770 1188 1950
2016 1088 200 618 1066 1270 290 NA NA 892 484 1082 308 814 2782 2000 1108
2017 1268 66 718 1294 1796 496 NA NA 1090 640 1162 266 712 2446 1896 1194
2018 814 154 1112 1916 1990 640 NA NA 1230 562 1324 444 776 3478 1570 2108
scroll_box(kable_styling(kable(cor(wide_sql[-1], use = "complete.obs", method="pearson")),
                         bootstrap_options = c("striped", "hover", "condensed")), width = "100%")
Accident Catenary Freight Interference Peak Freight Interference Off Peak Human Error Lift Deployment Locomotive Failure Non-Locomotive Equipment Failure Obstruction Debris Other Passenger Loading Passenger Train Interference Sick Injured Unruly Passenger Signal Switch Failure Track Work Weather
Accident 1.0000000 0.1435415 0.7806773 0.9608999 0.8976619 0.5824678 0.7215697 0.7642611 0.6828030 0.6668736 0.3463388 0.5872245 0.5954141 0.6114195 0.6652174 0.6677474
Catenary 0.1435415 1.0000000 -0.1255482 0.0037574 0.1328661 -0.2913271 0.0296204 0.0639429 0.0278461 -0.1184067 -0.2985693 -0.2269049 -0.0105223 0.2158549 -0.5200273 -0.2466628
Freight Interference Peak 0.7806773 -0.1255482 1.0000000 0.9091988 0.8020245 0.6661686 0.9547198 0.9741564 0.9170038 0.7587071 0.3725510 0.7711136 0.7310978 0.7209778 0.7028399 0.7798693
Freight Interference Off Peak 0.9608999 0.0037574 0.9091988 1.0000000 0.9399653 0.7224539 0.8670727 0.8811132 0.8227255 0.8051196 0.4622986 0.7564155 0.7459639 0.7342568 0.7756815 0.7157045
Human Error 0.8976619 0.1328661 0.8020245 0.9399653 1.0000000 0.8356280 0.8342163 0.7914287 0.7573302 0.8979099 0.6539403 0.8494920 0.8652815 0.8833295 0.7699593 0.5505740
Lift Deployment 0.5824678 -0.2913271 0.6661686 0.7224539 0.8356280 1.0000000 0.7013289 0.5799427 0.5998879 0.9445498 0.9267498 0.9752919 0.9003905 0.8317744 0.9168026 0.4295133
Locomotive Failure 0.7215697 0.0296204 0.9547198 0.8670727 0.8342163 0.7013289 1.0000000 0.9781492 0.9757232 0.8446837 0.4127356 0.8198952 0.8583588 0.8499444 0.6355407 0.5798980
Non-Locomotive Equipment Failure 0.7642611 0.0639429 0.9741564 0.8811132 0.7914287 0.5799427 0.9781492 1.0000000 0.9622677 0.7347822 0.2636146 0.7106868 0.7395528 0.7528201 0.5662177 0.6705569
Obstruction Debris 0.6828030 0.0278461 0.9170038 0.8227255 0.7573302 0.5998879 0.9757232 0.9622677 1.0000000 0.7937764 0.2714363 0.7237340 0.8117931 0.7594222 0.5631391 0.4725877
Other 0.6668736 -0.1184067 0.7587071 0.8051196 0.8979099 0.9445498 0.8446837 0.7347822 0.7937764 1.0000000 0.7778087 0.9605269 0.9844827 0.9054438 0.8402201 0.3593215
Passenger Loading 0.3463388 -0.2985693 0.3725510 0.4622986 0.6539403 0.9267498 0.4127356 0.2636146 0.2714363 0.7778087 1.0000000 0.8556134 0.7359260 0.7011243 0.7977969 0.2684835
Passenger Train Interference 0.5872245 -0.2269049 0.7711136 0.7564155 0.8494920 0.9752919 0.8198952 0.7106868 0.7237340 0.9605269 0.8556134 1.0000000 0.9415782 0.8986873 0.8649865 0.4857091
Sick Injured Unruly Passenger 0.5954141 -0.0105223 0.7310978 0.7459639 0.8652815 0.9003905 0.8583588 0.7395528 0.8117931 0.9844827 0.7359260 0.9415782 1.0000000 0.9448665 0.7375805 0.2765033
Signal Switch Failure 0.6114195 0.2158549 0.7209778 0.7342568 0.8833295 0.8317744 0.8499444 0.7528201 0.7594222 0.9054438 0.7011243 0.8986873 0.9448665 1.0000000 0.6072113 0.3542022
Track Work 0.6652174 -0.5200273 0.7028399 0.7756815 0.7699593 0.9168026 0.6355407 0.5662177 0.5631391 0.8402201 0.7977969 0.8649865 0.7375805 0.6072113 1.0000000 0.5830179
Weather 0.6677474 -0.2466628 0.7798693 0.7157045 0.5505740 0.4295133 0.5798980 0.6705569 0.4725877 0.3593215 0.2684835 0.4857091 0.2765033 0.3542022 0.5830179 1.0000000
sql <- "SELECT CAST(strftime('%Y', c.Report_Month) AS INT) AS Year,
               c.Late_Cause,
               SUM(c.Late_Trains) AS Total_Late_Trains
        FROM Causes c
        INNER JOIN Lines l ON c.LineID = l.ID
        WHERE c.Late_Cause IN 
              (SELECT DISTINCT sub_c.Late_Cause 
               FROM Causes AS sub_c 
               WHERE strftime('%Y', sub_c.Report_Month) = '2012')
          AND c.Late_Cause <> 'TOTAL TRAINS DELAYED'
        GROUP BY strftime('%Y', c.Report_Month),
                 c.Late_Cause"

agg_sql <- dbGetQuery(conn, sql)

kable_styling(knitr::kable(agg_sql[sample(1:nrow(agg_sql), 20),]),
              bootstrap_options = c("striped", "hover", "condensed"))
Year Late_Cause Total_Late_Trains
102 2014 Weather 3322
58 2012 Lift Deployment 500
73 2013 Freight Interference - Total 2348
29 2010 Passenger Loading 4130
26 2010 Non-Locomotive Equipment Failure 1144
66 2012 Signal/Switch Failure 2506
97 2014 Passenger Loading 1509
67 2012 Track Work 1806
113 2015 Passenger Train Interference 260
128 2016 Passenger Train Interference 308
53 2012 Catenary Failure 162
98 2014 Passenger Train Interference 490
90 2014 Freight Interference - Total 3670
60 2012 Non-Locomotive Equipment Failure 326
17 2009 Weather 2150
33 2010 Track Work 2746
63 2012 Passenger Loading 2364
114 2015 Sick, Injured, Unruly Passenger 574
93 2014 Locomotive Failure 1658
46 2011 Passenger Loading 4290
ggplot(subset(agg_sql, Late_Cause != 'Freight Interference - Total'),
              aes(Year, Total_Late_Trains, fill=Late_Cause)) + 
  geom_bar(position = "fill", stat = "identity") +
  labs(title="Metra Late Trains by Cause", x="Year", y="Late Trains") +
  guides(fill=guide_legend(ncol=4)) +
  scale_x_continuous("year", breaks=unique(agg_sql$Year)) +
  scale_y_continuous(labels = scales::percent_format()) +
  scale_fill_manual(values=seaborn_palette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))

# DISCONNECT FROM DATABASE
dbDisconnect(conn)


Conclusion: Relational Databases in R